### B Truesdale
### Replication and extension of Mouw & Kalleberg 2010

## -------------------------
## Reproducing M&K Table 4, Col 2: Multivariate Analysis with Fixed Effects
## -------------------------

## We begin with a file generated from the authors' Stata code.  Ted Mouw, the corresponding author, generously and promply provided his replication code by email upon request.

setwd("/Users/btrue/Desktop/tab4")
library(foreign)

impute_men_06 <- read.dta("temp_igls_1_2006_occ00.dta")
head(impute_men_06)
dim(impute_men_06)  # We have 366,859 obs x 44 variables

# --- Approach 1: 

## We run a fixed effects regression using dummy variables for the 501 occupations. Because this datafile contains five entries for each individual with an imputed wage value, and only one entry for each individual with an observed wage,  use weight = weight2

## This regression is too large to run on a laptop, but can be run on a computer with larger memory or on a server

test <- lm (lnwage ~ xgrade92_38 + xgrade92_40 + xgrade92_41 + xgrade92_42 + xgrade92_43 + xgrade92_44 + xgrade92_45 + xgrade92_46 + c1race_2 + c1race_3 + c1race_4 + c2age2_25 + c2age2_30 + c2age2_35 + c2age2_40 + c2age2_45 + c2age2_50 + c2age2_55 + c2age2_60 + c2age2_65 + c3unionmme_2 + as.factor(occ), data=impute_men_06, weights=weight2)
test.coef <-coef(summary.lm(test))
test.coef
write.csv(test.coef, file="LM-weights2.csv")
## This gives coefficients of similar magnitude and same sign to those in Table 4 in Mouw & Kalleberg. 

# --- Approach 2: Plyr

## An alternative, more computationally efficient approach is to use the package plyr to subtract the occupation-specific means from each variable

vars <-c("lnwage","xgrade92_38", "xgrade92_40", "xgrade92_41", "xgrade92_42", "xgrade92_43", "xgrade92_44", "xgrade92_45", "xgrade92_46", "c1race_2", "c1race_3", "c1race_4", "c2age2_25","c2age2_30", "c2age2_35", "c2age2_40", "c2age2_45", "c2age2_50", "c2age2_55", "c2age2_60", "c2age2_65", "c3unionmme_2", "occ") 
testA <- impute_men_06[vars]
dim(testA)  #366859 x 23
ls()
rm("impute_men_06")

testAa <- ddply(testA, .(occ), transform, 
lnwage.m = lnwage-mean(lnwage), 
xgrade92_38.m = xgrade92_38-mean(xgrade92_38), 
xgrade92_40.m = xgrade92_40-mean(xgrade92_40), 
xgrade92_41.m = xgrade92_41-mean(xgrade92_41),
xgrade92_42.m = xgrade92_42-mean(xgrade92_42), 
xgrade92_43.m = xgrade92_43-mean(xgrade92_43), 
xgrade92_44.m = xgrade92_44-mean(xgrade92_44), 
xgrade92_45.m = xgrade92_45-mean(xgrade92_45), 
xgrade92_46.m = xgrade92_46-mean(xgrade92_46),
c1race_2.m = c1race_2-mean(c1race_2), 
c1race_3.m = c1race_3-mean(c1race_3),
c1race_4.m = c1race_4-mean(c1race_4), 
c2age2_25.m = c2age2_25-mean(c2age2_25),
c2age2_30.m = c2age2_30-mean(c2age2_30), 
c2age2_35.m = c2age2_35-mean(c2age2_35),
c2age2_40.m = c2age2_40-mean(c2age2_40),
c2age2_45.m = c2age2_45-mean(c2age2_45), 
c2age2_50.m = c2age2_50-mean(c2age2_50), 
c2age2_55.m = c2age2_55-mean(c2age2_55), 
c2age2_60.m = c2age2_60-mean(c2age2_60), 
c2age2_65.m = c2age2_65-mean(c2age2_65),
c3unionmme_2.m = c3unionmme_2-mean(c3unionmme_2),
.progress = "text")  

dim(testAa)
head(testAa)
write.csv(testAa, file="testAa-meanadj.csv")

testAa.ols <- lm(lnwage.m ~ xgrade92_38.m + xgrade92_40.m + xgrade92_41.m + xgrade92_42.m + xgrade92_43.m + xgrade92_44.m + xgrade92_45.m + xgrade92_46.m + c1race_2.m + c1race_3.m + c1race_4.m + c2age2_25.m + c2age2_30.m + c2age2_35.m + c2age2_40.m + c2age2_45.m + c2age2_50.m + c2age2_55.m + c2age2_60.m + c2age2_65.m + c3unionmme_2.m, data = testAa, weights=weight2)

testAa.coef <-coef(summary.lm(testAa.ols))
testAa.coef
write.csv(testAa.coef, file="OLS after Plyr.csv")

# The results are exactly the same as OLS with 500 dummy variables on the RCE (approach 1 above).


## -------------------------
## Reproducing M&K Table 4, Col 4: Variance of Log Wages
## -------------------------

# Create the X, Y, and Z using the testAa data (these have the occ-specific means subtracted).  Attempting to run the optimization directly on the RCE (with effectively 1000+ variables proved all but impossible)

x.vars <-c("xgrade92_38.m", "xgrade92_40.m", "xgrade92_41.m", "xgrade92_42.m", "xgrade92_43.m", "xgrade92_44.m", "xgrade92_45.m", "xgrade92_46.m", "c1race_2.m", "c1race_3.m", "c1race_4.m", "c2age2_25.m","c2age2_30.m", "c2age2_35.m", "c2age2_40.m", "c2age2_45.m", "c2age2_50.m", "c2age2_55.m", "c2age2_60.m", "c2age2_65.m", "c3unionmme_2.m") 
x.var.mat <-testAa[x.vars]
x.mat <-cbind(1, as.matrix(x.var.mat))
head(x.mat)
dim(x.mat)  #333859 x 22 variables incl intercept
class(x.mat)

y.vec <-testAa$lnwage.m

## The function.  Note that sigma2 is reparamerized so that it can vary according to a set of variables
ll.normal.vary <- function (par, X, Y, Z) {
	beta <-par[1:ncol(X)]
	gamma <- par[(ncol(X)+1):(ncol(X)+ncol(Z))]
	-1/2* sum((Z %*% gamma) + ((Y - X %*% beta)^2)/exp(Z %*% gamma))
}

## An equivalent formulation of the function:
# ll.vary.2 <- function(par,Y,X,Z){
#  beta <- par[1:ncol(X)]
#  sigma2 <- exp(Z%*%par[(ncol(X)+1):length(par)]) 
#  return(-1/2 * (sum(log(sigma2)+ (Y -(X%*%beta))^2/sigma2)))
# }

# Optimize.  

start1 <- rep(0.1, ncol(x.mat)+ncol(x.mat))

v.optim1 <- optim (par = start1, fn=ll.normal.vary, X=x.mat, Y=y.vec, Z=x.mat, method = "BFGS", hessian = F, control = list(fnscale = -1))  
v.optim1$par  #44 coefficients.  
v.optim1$value

#Output table
x.names <-colnames(x.mat)
varnames <- c(x.names, x.names)
beta.gamma.table<-cbind(varnames, round(v.optim1$par, digits=3))
write.csv(beta.gamma.table, file="beta_gamma_table.csv")

# Need to retransform gammas to sigma2

gammas <-v.optim1$par[(ncol(x.mat)+1):(ncol(x.mat)+ncol(x.mat))]
gammas

sigma2 <- exp(x.mat %*% gammas)
dim(sigma2)
# This gives individual sigma2 values 

### I believe that sigma2 is the square of the residual in M&K terms, which is the dependent variable for Col4:

varAa <- cbind(sigma2, testAa)

varAa.ols <- lm(sigma2 ~ xgrade92_38.m + xgrade92_40.m + xgrade92_41.m + xgrade92_42.m + xgrade92_43.m + xgrade92_44.m + xgrade92_45.m + xgrade92_46.m + c1race_2.m + c1race_3.m + c1race_4.m + c2age2_25.m + c2age2_30.m + c2age2_35.m + c2age2_40.m + c2age2_45.m + c2age2_50.m + c2age2_55.m + c2age2_60.m + c2age2_65.m + c3unionmme_2.m, data = varAa)

varAa.coef <-coef(summary.lm(varAa.ols))
varAa.coef  #These are about the right magnitude (though some are twice the size).  
write.csv(varAa.coef, file="Var testAa.csv")


## -------------------------
## INTERACTION OF AGE AND OCCUPATION
## Does the relationship between age and predicted wage differ across broad occupational categories?
## -------------------------

## We begin this section with the same data file used from the start, and correct the union variable coding, merge with Census occ codes, and choose the needed variables

library(foreign)
library(Zelig)
library(plyr)

summary(impute_men_06$c3unionmme_2) #mean = .86.  This is coded backward. 
dim(impute_men_06)  # We have 366,859 obs x 44 variables

## Add the Census occupational codes to the dataframe
setwd("/Users/btruesdale/Dropbox/Repro paper/Revisions")
censxsoc <- read.csv("censxsoc.csv")
head(censxsoc)
censvars <- c("occ00", "soccode_lg", "soctitle_lg")
censxsoc.sub <- unique(censxsoc[censvars])
head(censxsoc.sub)
dim(censxsoc.sub)  #506 occs

impute_men_06_b <-merge(impute_men_06, censxsoc.sub, by="occ00")
head(impute_men_06_b)
table(censxsoc$soctitle_lg) #23 large occupational categories
dim(impute_men_06_b) #366859 obs

## Subset the file so it contains only the necessary variables
ageocc.vars <-c("lnwage","grade92", "race", "age2", "c3unionmme_2", "soccode_lg", "soctitle_lg", "weight2")
ageocc06.data <-impute_men_06_b[ageocc.vars]
head(ageocc06.data)

## Recode the union variable correctly
ageocc06.data$union <- ageocc06.data$c3unionmme_2 +1
ageocc06.data$union[ageocc06.data$union == 2] <- 0
summary(ageocc06.data)
ageocc06.data$c3unionmme_2 <- NULL

## Save this data file separately
write.csv(ageocc06.data, file = "ageocc06.data.csv")

## -------------------------
## Age-Wage Profile for each of the 22 major civilian occupational categories
## Regression and Plot - 22 times
## -------------------------

## If you are starting here, then load the reduced file:

# setwd("C:/Users/btruesdale/Dropbox/Repro paper/Revisions")
# ageocc06.data <- read.csv("ageocc06.data.csv")
# head(ageocc06.data)

# library(foreign)
# library(Zelig)
# library(plyr)

class(ageocc06.data$grade92) #integer.  Reconvert the variable classes to factor
ageocc06.data$grade92 <- as.factor(ageocc06.data$grade92)
ageocc06.data$age2 <- as.factor(ageocc06.data$age2)

ageocc06.data$soclabels <-factor(ageocc06.data$soccode_lg, labels = c("11-Management", "13-Business & Financial", "15-Computer & Mathematical", "17-Architec. & Engineering", "19-Science", "21-Community & Social Services", "23-Legal", "25-Education & Library", "27-Arts, Design, Sports, Media", "29-Healthcare Practitioners", "31-Healthcare Support", "33-Protective Service", "35-Food Preparation & Serving", "37-Bldg & Grounds Maintenance", "39-Personal Care & Service", "41-Sales & Related", "43-Office & Admin. Support", "45-Farming, Fishing, Forestry", "47-Construction & Extraction", "49-Install., Maintenance, Repair", "51-Production", "53-Transportation"))
## Note I took out: 
# "Entertainment" from Arts etc, 
# "Cleaning &" from Building & Grounds Maintenance
# "Life, Physical, & Social" from Science
# "& Training" from Education
# & Technical from Healthcare Practitioners

levels(ageocc06.data$soclabels)

# ----------------------- Zelig ---------------------------

z.out <- zelig(lnwage ~ grade92 + race + age2 + union + soclabels + age2*soclabels, data=ageocc06.data, model="ls", weights=ageocc06.data$weight2)
summary(z.out)

################
##  This section uses Zelig setx and sim commands to set a series of X variables, then simulate expected values.  The goal is to have 22 plots of expected wages vs age, one plot for each of the occupational categories represented in soccode_lg and soctitle_lg.  Thanks to Steve Worthington at IQSS for advice on lapply and loops.


# --- Now to create the sets of X's ---
## Set X for high school graduates, white, non-union, range of ages, for each soccode_lg.  

## Example: this is what we want to do for each occ category
# x.49 <- setx(z.out, grade92=39, race="white", union=0, age2=c(20, 25, 30, 35, 40, 45, 50, 55, 60, 65), soccode_lg="49-0000")

## Iterate through all levels of soccode_lg (both versions are equally as fast)

## Define the function and use lapply:  
setFun <- function(x) setx(z.out, grade92=39, race="white", union=0, age2=c(20, 25, 30, 35, 40, 45, 50, 55, 60, 65), soclabels = x)
x.num <- lapply(z.out$xlevels$soclabels, setFun)
class(x.num)  # List

## An alternative coding using plyr (same speed as lapply):
# x.num <- llply(z.out$xlevels$soclabels, function(x) {
#    setx(z.out, grade92=39, race="white", union=0, age2=c(20, 25, 30, 35, 40, 45, 50, 55, 60, 65), soclabels = x)
# }) 

## --- Simulate expected values ---

## Example: do this for each occ category
# s.out.49 <-sim(z.out, x.49)
# summary(s.out.49)
# summary(s.out.49$qi$ev)

## Define the function and use lapply: 
s.out <- lapply(x.num, function(x) {
    sim(z.out, x = x)
})


## --- Calculate mean and CI for each occ category ---

## Example: for each occ category, create the matrix of simulated expected values, and calculate mean and CI
# ev.mat.49 <- exp(s.out.49$qi$ev) 
# head(ev.mat.49)
# ev.mean.49 <-apply(ev.mat.49, 2, mean)
# ev.mean.49
# ev.bounds.49 <- t(apply(ev.mat.49, 2, function(x) quantile(x, c(.025, .975))))
# ev.bounds.49


## Iterate through matrices of simulated expected values, and calculate mean and CI
ev.mat <- vector(mode = "list", length = length(s.out))
ev.mean <- vector(mode = "list", length = length(ev.mat))
ev.bounds <- vector(mode = "list", length = length(ev.mean))
ev.bounds025 <- vector(mode = "list", length = length(ev.mean))
ev.bounds975 <- vector(mode = "list", length = length(ev.mean))
for (i in seq_along(s.out)) {
    ev.mat[[i]] <- exp(s.out[[i]]$qi$ev)
    ev.mean[[i]] <- apply(ev.mat[[i]], 2, mean, na.rm = TRUE)
    ev.bounds[[i]] <- t(apply(ev.mat[[i]], 2, function(x) quantile(x, c(.025, .975))))
    ev.bounds025[[i]] <- apply(ev.mat[[i]], 2, function(x) quantile(x, .025))
    ev.bounds975[[i]] <- apply(ev.mat[[i]], 2, function(x) quantile(x, .975))
}

## --- Plot ---

## Example: for each occ category, plot mean and bounds:
# plot(x=seq(from=20, to=65, by=5), y=ev.mean.49, type="l", lwd=2,
# xlab="Worker's Age",
# ylab="Expected Hourly Wage",
# main = "Installation, Maintenance, and Repair Occupations (49-0000)", 
# ylim = c(10, 25))
# lines(seq(from=20, to=65, by=5), ev.bounds.49[,1], lwd=1.75, col="red", lty=2)
# lines(seq(from=20, to=65, by=5), ev.bounds.49[,2], lwd=1.75, col="red", lty=2)


## Plot using ggplot.  ggplot prefers "long" shape dataframes:
library(reshape2)
# ev.mean dataframe long
ev.mean.df <- as.data.frame(do.call(rbind, ev.mean))
ev.mean.df.long <- melt(t(ev.mean.df))
ev.mean.df.long$age <- rep(seq(from=20, to=65, by=5), length(ev.mean))
ev.mean.df.long$soclabels <- rep(levels(ageocc06.data$soclabels), each = 10)
ev.mean.df.long$soclabels <- rep(levels(ageocc06.data$soclabels), each = 10)
# ev.bounds025 dataframe long
ev.bounds.df.025 <- as.data.frame(do.call(rbind, ev.bounds025))
ev.bounds.df.025.long <- melt(t(ev.bounds.df.025))
ev.bounds.df.025.long$age <- rep(seq(from=20, to=65, by=5), length(ev.bounds025))
ev.bounds.df.025.long$soclabels <- rep(levels(ageocc06.data$soclabels), each = 10)
ev.bounds.df.025.long$soclabels <- rep(levels(ageocc06.data$soclabels), each = 10)
# ev.bounds975 dataframe long
ev.bounds.df.975 <- as.data.frame(do.call(rbind, ev.bounds975))
ev.bounds.df.975.long <- melt(t(ev.bounds.df.975))
ev.bounds.df.975.long$age <- rep(seq(from=20, to=65, by=5), length(ev.bounds975))
ev.bounds.df.975.long$soclabels <- rep(levels(ageocc06.data$soclabels), each = 10)
ev.bounds.df.975.long$soclabels <- rep(levels(ageocc06.data$soclabels), each = 10)
# plot
library(ggplot2)
facet_plot <- ggplot(data = ev.mean.df.long, aes(x = age, y = value)) +
    facet_wrap(~ soclabels, ncol = 5) +
    geom_line(colour="blue") +
    geom_line(data = ev.bounds.df.025.long, aes(x = age, y = value), colour="red", linetype = 2) +
    geom_line(data = ev.bounds.df.975.long, aes(x = age, y = value), colour="red", linetype = 2) +
    labs(x="Worker's Age", y="Expected Hourly Wage") +
    theme_bw() +
    opts(title = "Age-Wage Profiles for Men in the Large SOC Occupational Categories (2006-07)",
	   strip.text.x = theme_text(size=8),
         plot.title = theme_text(size = 15, vjust = 1),
         axis.title.x = theme_text(size = 13, hjust = 0.53, , vjust = -0.2),
         axis.title.y = theme_text(size = 13, angle = 90, hjust = 0.52, vjust = 0.2)
         )
ggsave(facet_plot, filename = "Age-Wage_facet_plot3.pdf", width=10, height=11)


## --- Note for a next step ---

## Could create the same series of 22 plots, but instead of using grade92=39 (ie high school) every time, substitute the modal value of education for each soccode_lg in each plot.  For example, the modal education level (grade92) for Management Occupations (soccode_lg = 11-0000) is 43, ie college graduate.  So instead of setting x.11 above with grade92 = 39, we'd set x.11 with grade92=43. 

## This generates the modal education value for each of the 22 occ groups:

## Modal education values for each occ category
# install.packages("modeest") # do this once
library(modeest)

mlv(ageocc06.data$grade92, method="mfv")$M
class(educmode.all)

educmode.all <- mlv(ageocc06.data$grade92, method="mfv")$M # The modal education value for the whole dataset is 39, ie high school graduate

occ.ed.mode.calc <- function(X) {
	occ.mode <- mlv(X$grade92, method="mfv")$M
	}

educmode<- ddply(ageocc06.data, .(soclabels), occ.ed.mode.calc)
educmode  # The modal education level for each of the 22 occ groups
class(educmode)
write.csv(educmode, file = "educmode.csv")


### ------------------------
## Age-Wage Profile for Skilled Manual Workers ("young men's jobs", ymj)
### ------------------------

## --- File Setup, part 1: choosing the set of Skilled Manual jobs ---
## The Chosen Occ Descriptions.csv file is generated by choosing occupations according to certain task-specific characteristics from O*NET, the Occupational Information Network, available publicly online.  The file Chosen Occ Descriptions.csv is made available on Dataverse.  What follows is the code used to construct this list from O*NET.  Thanks to Diana Draghici for help with subsetting and merging. 

## O*NET
## Selecting a set of occupational codes based on particular values of skills and abilities.  Note that some of the code is redundant - this file shouldn't be run as a whole 
## March 2012

### Set working directory to location where files are saved
setwd("/Users/btrue/Desktop/IDEAS/YMJ/ONET Info/db_16_0")

### Read in data
Abilities <- read.csv("Abilities.csv", sep=",", header=TRUE)
head(Abilities)

Skills <- read.csv("Skills.csv", sep=",", header=TRUE)
head (Skills)
class (Skills)

occdata <- read.csv("Occupation Data2.csv", sep=",", header = TRUE)
head(occdata)
dim(occdata)
tail(occdata)

### Select rows from Skills that match specified criteria:
    ## 1) In Skills, the importance of Critical Thinking is >= 3.0 (that is: 
         # (a) column "Element Name," contains "Critical Thinking"; and
         # (b) column "Scale ID" contains "IM," and
         # (c) column Data Value is >= 3.0)
    ## 2) In Skills, the importance of Complex Problem Solving is >= 3.0 (same thinking as in #1)

subset_S1 <- subset(Skills, Skills[,"Element.Name"]=="Critical Thinking" &
Skills[,"Scale.ID"]=="IM"  & Skills[,"Data.Value"]>= 3.0)
head(subset_S1)

subset_S2 <- subset(Skills, Skills[,"Element.Name"]=="Complex Problem Solving" &
Skills[,"Scale.ID"]=="IM"  & Skills[,"Data.Value"]>= 3.0)
head(subset_S2)


### Select rows from Abilities that match specified criteria:
    ## 3) In Abilities, the importance of Manual Dexterity is >= 3.0
    ## 4) In Abilities, the importance of Trunk Strength is >= 3.0

subset_A3 <- subset(Abilities, Abilities[,"Element.Name"]=="Manual Dexterity" &
Abilities[,"Scale.ID"]=="IM"  & Abilities[,"Data.Value"]>= 3.0)
head(subset_A3)

subset_A4 <- subset(Abilities, Abilities[,"Element.Name"]=="Trunk Strength" &
Abilities[,"Scale.ID"]=="IM"  & Abilities[,"Data.Value"]>= 3.0)
head(subset_A4)

### Concatenate subsets vertically - this is just if you want the full list of occs that have any one of the four characteristics
new.subset <- rbind(subset_S1, subset_S2, subset_A3, subset_A4)

### A partial merge
merge.S2.A4 <-merge(subset_S2, subset_A4, by="O.NET.SOC.Code")
head(merge.S2.A4)

### The full merge
merge.skills <-merge(subset_S1, subset_S2, by="O.NET.SOC.Code")

merge.ability <-merge(subset_A3, subset_A4, by="O.NET.SOC.Code")

merge.all <-merge(merge.skills, merge.ability, by ="O.NET.SOC.Code")
head(merge.all)

### Export dataset containing relevant occupation codes to .csv format 
write.csv(merge.all, file = "OccupationCodes.csv")
# This delivers 76 occupations

### Pulling out the column of SOC codes only
soc.codes <- merge.all["O.NET.SOC.Code"]
head(soc.codes)

occ.descrip<-merge(soc.codes, occdata, by="O.NET.SOC.Code")
write.csv(occ.descrip, file = "Chosen Occ Descriptions.csv")



## --- File Setup, part 2: crosswalks ---

ymj <- read.csv("Chosen Occ Descriptions.csv")
head(ymj)

ymj$soccode.dup <- as.character(ymj$O.NET.SOC.Code)
class(ymj$soccode.dup)
head(ymj$soccode.dup)
length(ymj$soccode.dup)  #76

## Generate the 2010 SOC codes, which are just the O.NET code minus the final .XX
test.split <-unlist(strsplit(ymj$soccode.dup, "\\."))
head(test.split)

logical.rep <-rep(c("TRUE", "FALSE"), 76)

ymj$soccode.2010 <- test.split[as.logical(logical.rep)]
head(ymj)

ymj$soccode.dup <-NULL
dim(ymj) #76 occs

# Take out the detailed codes and remove duplicates to prepare for merge - choose soccode.2010 only
ymj.mg.var = "soccode.2010"
ymj.mg <-unique(ymj[ymj.mg.var])
head(ymj.mg)
dim(ymj.mg) #70 occs

## Now we need to crosswalk soccode.2010 to soccode (ie, the 2000 SOC codes)

xwalk.00.10 <- read.csv("soc_2000_to_2010_crosswalk.csv")
head(xwalk.00.10)
xwalk.colnames <- c("soccode", "2000.SOC.title", "soccode.2010", "2010.SOC.title")
colnames(xwalk.00.10) <-xwalk.colnames
head(xwalk.00.10)
dim(xwalk.00.10) #1441 occs
length(unique(xwalk.00.10$soccode))  #822 occs
length(unique(xwalk.00.10$soccode.2010)) #842 occs


ymj2 <- merge(ymj.mg, xwalk.00.10, by="soccode.2010")
head(ymj2)  #This has the original O.NET code, the 2010 SOC code, and the 2000 SOC code.
dim(ymj2) #71 occs

## Next crosswalk soccode (2000) to occ00, the variable contained in the NBER files, as we did for the full data at the top of this file
# Note that you don't need to open censxsoc again if you ran this file from the top
# censxsoc <- read.csv("censxsoc.csv")
head(censxsoc)
ymj.soc.var <- c("occ00", "soccode")
censxsoc.ymj <- censxsoc[ymj.soc.var]
head(censxsoc.ymj)
dim(censxsoc.ymj) #798
length(unique(censxsoc.ymj$soccode)) #798

ymj3 <-merge(ymj2, censxsoc.ymj, by="soccode")
head(ymj3)
dim(ymj3)  # 70 occs, good
length(unique(ymj3$soccode)) # Down to 70 occupations in the 2000 SOC codes (eg "Acute Care Nurses" become "Registered Nurses")


## And now pull out only those individuals in the imputed 2006-07 data who work in ymj
# Note that you don't need to open impute_men_06 again if you ran this file from the top
#setwd("E:/The real thing/Source files for Table 4")
# impute_men_06 <- read.dta("temp_igls_1_2006_occ00.dta")
# setwd("E:/The real thing/Revisions")
head(impute_men_06)

ymj.full <-merge(impute_men_06, ymj3, by="occ00")
head(ymj.full)
dim(ymj.full)  #53321 x 48
length(unique(ymj.full$soccode))  #Good, still 70 soccodes
check <-as.matrix(unique(ymj.full$soccode))
check  # This is the list of soccodes

# Subset the file so it contains only the necessary variables
ymj.vars <-c("lnwage","grade92", "race", "age2", "c3unionmme_2", "soccode", "weight2", "wage_re")
ymj.short <- ymj.full[ymj.vars]
head(ymj.short)
summary(ymj.short$wage_re)  #38,300 NA's of 53321 obs = 15021 real people with observed wages and 7660 with imputed wages


# Recode the union variable
ymj.short$union <- ymj.short$c3unionmme_2 +1
ymj.short$union[ymj.short$union == 2] <- 0
summary(ymj.short)
ymj.short$c3unionmme_2 <- NULL


## Save this file
write.csv(ymj.short, file = "ymj_short.csv")

## --- Analysis (YMJ) ---

ymj.short$grade92 <- as.factor(ymj.short$grade92)
ymj.short$age2 <- as.factor(ymj.short$age2)
summary(ymj.short)  #Note the original file will all occs is about 13.7% union; this is 18.1% union - but note that this is calculated on the basis of overcounting the imputed ones.
# Relatively small percentage are college grads.  HS grads are normative.

z.out.ymj <- zelig(lnwage ~ grade92 + race + age2 + union, data=ymj.short, model="ls", weights = ymj.short$weight2)
summary(z.out.ymj)

x.out.ymj <- setx(z.out.ymj, grade92=39, race="white", union=0, age2=c(20, 25, 30, 35, 40, 45, 50, 55, 60, 65))

s.out.ymj <-sim(z.out.ymj, x.out.ymj)
summary(s.out.ymj$qi$ev)

## Calculate mean and CI

ev.mat.ymj <- exp(s.out.ymj$qi$ev) 
ev.mean.ymj <-apply(ev.mat.ymj, 2, mean)
ev.mean.ymj
ev.bounds.ymj <- t(apply(ev.mat.ymj, 2, function(x) quantile(x, c(.025, .975))))
ev.bounds.ymj

## Plot 

plot(x=seq(from=20, to=65, by=5), y=ev.mean.ymj, type="l", lwd=2, col="blue",
xlab="Worker's Age",
ylab="Expected Hourly Wage",
main = "Age-Wage Profiles for Men in Skilled Manual Occupations (2006-07)", 
 ylim = c(10, 25))
 lines(seq(from=20, to=65, by=5), ev.bounds.ymj[,1], lwd=1.75, col="red", lty=2)
 lines(seq(from=20, to=65, by=5), ev.bounds.ymj[,2], lwd=1.75, col="red", lty=2)




##################
## Do it again by union membership
# Starting with the z.out results above:

x.out.ymj.union <- setx(z.out.ymj, grade92=39, race="white", union=1, age2=c(20, 25, 30, 35, 40, 45, 50, 55, 60, 65))
x.out.ymj.nonunion <- setx(z.out.ymj, grade92=39, race="white", union=0, age2=c(20, 25, 30, 35, 40, 45, 50, 55, 60, 65))

s.out.ymj.union <-sim(z.out.ymj, x.out.ymj.union)
s.out.ymj.nonunion <-sim(z.out.ymj, x.out.ymj.nonunion)


## Calculate mean and CI

ev.mat.ymj.union <- exp(s.out.ymj.union$qi$ev) 
ev.mean.ymj.union <-apply(ev.mat.ymj.union, 2, mean)
ev.mean.ymj.union
ev.bounds.ymj.union <- t(apply(ev.mat.ymj.union, 2, function(x) quantile(x, c(.025, .975))))
ev.bounds.ymj.union

ev.mat.ymj.nonunion <- exp(s.out.ymj.nonunion$qi$ev) 
ev.mean.ymj.nonunion <-apply(ev.mat.ymj.nonunion, 2, mean)
ev.mean.ymj.nonunion
ev.bounds.ymj.nonunion <- t(apply(ev.mat.ymj.nonunion, 2, function(x) quantile(x, c(.025, .975))))
ev.bounds.ymj.nonunion

## Plot 

plot(x=seq(from=20, to=65, by=5), y=ev.mean.ymj.nonunion, type="l", lwd=2, col="blue",
xlab="Worker's Age",
ylab="Expected Hourly Wage",
cex.lab=1.4,
main = "Age-Wage Profiles for Men in Skilled Manual Occupations (2006-07)", 
 ylim = c(10, 25))
 abline(v=(seq(20,60,10)), col="lightgray", lty=1, lwd=.25)
 abline(h=(seq(10,25,5)), col="lightgray", lty=1, lwd=.25)
 lines(seq(from=20, to=65, by=5), ev.bounds.ymj.nonunion[,1], lwd=1.75, col="darkblue", lty=2)
 lines(seq(from=20, to=65, by=5), ev.bounds.ymj.nonunion[,2], lwd=1.75, col="darkblue", lty=2)
 lines(seq(from=20, to=65, by=5), ev.mean.ymj.union, lwd=2, col="darkolivegreen", lty=1)
 lines(seq(from=20, to=65, by=5), ev.bounds.ymj.union[,1], lwd=1.75, col="darkgreen", lty=2)
 lines(seq(from=20, to=65, by=5), ev.bounds.ymj.union[,2], lwd=1.75, col="darkgreen", lty=2)
legend(40, 13,
	c("Union members", "Non-union members"),
	lty = c(1,1),
	lwd = c(2, 2), 
	col = c("darkolivegreen", "blue"),
	bg="white",
	cex=1.4)


## Note: for finding nice colors
# colors()[grep("green", colors())]


###############
## Does union membership make a difference at different ages? 

z.out.ymj.uint <- zelig(lnwage ~ grade92 + race + age2 + union + age2*union, data=ymj.short, model="ls")
summary(z.out.ymj.uint)

x.out.ymj.uint <- setx(z.out.ymj.uint, grade92=39, race="white", union=1, age2=c(20, 25, 30, 35, 40, 45, 50, 55, 60, 65))
x.out.ymj.nonuint <- setx(z.out.ymj.uint, grade92=39, race="white", union=0, age2=c(20, 25, 30, 35, 40, 45, 50, 55, 60, 65))

s.out.ymj.uint <-sim(z.out.ymj.uint, x.out.ymj.uint)
s.out.ymj.nonuint <-sim(z.out.ymj.uint, x.out.ymj.nonuint)

ev.mat.ymj.uint <- exp(s.out.ymj.uint$qi$ev) 
ev.mean.ymj.uint <-apply(ev.mat.ymj.uint, 2, mean)
ev.bounds.ymj.uint <- t(apply(ev.mat.ymj.uint, 2, function(x) quantile(x, c(.025, .975))))

ev.mat.ymj.nonuint <- exp(s.out.ymj.nonuint$qi$ev) 
ev.mean.ymj.nonuint <-apply(ev.mat.ymj.nonuint, 2, mean)
ev.bounds.ymj.nonuint <- t(apply(ev.mat.ymj.nonunion, 2, function(x) quantile(x, c(.025, .975))))


plot(x=seq(from=20, to=65, by=5), y=ev.mean.ymj.nonuint, type="l", lwd=2, col="blue",
xlab="Worker's Age",
ylab="Expected Hourly Wage",
main = "Age-Wage Profiles for Men in Skilled Manual Occupations (2006-07)", 
 ylim = c(10, 25))
 lines(seq(from=20, to=65, by=5), ev.bounds.ymj.nonuint[,1], lwd=1.75, col="darkblue", lty=2)
 lines(seq(from=20, to=65, by=5), ev.bounds.ymj.nonuint[,2], lwd=1.75, col="darkblue", lty=2)
 lines(seq(from=20, to=65, by=5), ev.mean.ymj.uint, lwd=2, col="darkolivegreen", lty=1)
 lines(seq(from=20, to=65, by=5), ev.bounds.ymj.uint[,1], lwd=1.75, col="darkgreen", lty=2)
 lines(seq(from=20, to=65, by=5), ev.bounds.ymj.uint[,2], lwd=1.75, col="darkgreen", lty=2)

## Answer: No, the union premium looks pretty much the same at all ages.  The t tests show each of the union*age interactions as not significant, but the simulated QOI do as good a job as an F test of saying whether they're really different overall


## AVERAGE PERCENT OF MEN IN THE OCCUPATION, 2006-07

library(foreign)
library(plyr)
setwd("/Users/btrue/Desktop/The real thing/NBER data")

morg06 <- read.dta("morg06.dta")
morg07 <- read.dta("morg07.dta")
dim(morg06)  #320551
dim(morg07)  #318207

head(morg06)
sexocc.vars <- c("sex", "occ00")
morg06.sub <-morg06[sexocc.vars]
head(morg06.sub)
morg06.sub <- na.omit(morg06.sub)
dim(morg06.sub) #220469

morg07.sub <-morg07[sexocc.vars]
morg07.sub <- na.omit(morg07.sub)
dim(morg07.sub) #218141

# Match to CPS categories
setwd("/Users/btrue/Dropbox/Repro paper/Revisions")
censxsoc <- read.csv("censxsoc.csv")
head(censxsoc)
censvars <- c("occ00", "soccode_lg", "soctitle_lg")
censxsoc.sub <- unique(censxsoc[censvars])
head(censxsoc.sub)
dim(censxsoc.sub)  #506 occs

morg06.mg <- merge(morg06.sub, censxsoc.sub, by="occ00")
morg06.mg$soclabels <-factor(morg06.mg$soccode_lg, labels = c("11-Management", "13-Business & Financial", "15-Computer & Mathematical", "17-Architec. & Engineering", "19-Science", "21-Community & Social Services", "23-Legal", "25-Education & Library", "27-Arts, Design, Sports, Media", "29-Healthcare Practitioners", "31-Healthcare Support", "33-Protective Service", "35-Food Preparation & Serving", "37-Bldg & Grounds Maintenance", "39-Personal Care & Service", "41-Sales & Related", "43-Office & Admin. Support", "45-Farming, Fishing, Forestry", "47-Construction & Extraction", "49-Install., Maintenance, Repair", "51-Production", "53-Transportation", "55-Military"))
## Note I took out: 
# "Entertainment" from Arts etc, 
# "Cleaning &" from Building & Grounds Maintenance
# "Life, Physical, & Social" from Science
# "& Training" from Education
# & Technical from Healthcare Practitioners

# Recode gender variable so male = 1 and female = 0
morg06.mg$sex.re <- morg06.mg$sex
morg06.mg$sex.re[morg06.mg$sex == 2] <-0

head(morg06.mg)
dim(morg06.mg)

sex.avg.calc <- function(X) {
	mean(X$sex.re)
	}
pctmen06 <- ddply(morg06.mg, .(soclabels), sex.avg.calc)
pctmen06
write.csv(pctmen06, "pctmen06.csv")
